*===============================================================================
*=== FILE 7 --- FINALIZE FOR-SALE DATA 
*===============================================================================
clear all
set more off

cap cd ""


* Convert
import delimited "data\pseudo\propTrans.txt", delimiter("||", collapse) clear
save "replication output\transaction_data", replace

* fix column shift, rename variable, label
rename transactionid transactionid_
rename zillowpropertyid zillowpropertyid_
rename countyregionid countyregionid_
rename datasourcetype datasourcetype_
rename bestrecordflag bestrecordflag_
rename saledollarcnt saledollarcnt_
rename transdatekey transdatekey_
rename loanvaluedollarcnt loanvaluedollarcnt_
rename isarmslength isarmslength_
rename v10 v10_

rename transactionid_ obs
rename zillowpropertyid_ transactionid
rename countyregionid_ propertyid
rename datasourcetype_ countyregionid
rename bestrecordflag_ datasourcetype
rename saledollarcnt_ bestrecordflag
rename transdatekey_ transactionprice
rename loanvaluedollarcnt_ transdatekey
rename isarmslength_ loanvaluedollarcnt
rename v10_ isarmslength

label variable obs "observation"
label variable transactionid "TransactionID"
label variable propertyid "PropertyID"
label variable countyregionid "CountyRegionID"
label variable datasourcetype "DataSource"
label variable bestrecordflag "BestRecordFlag"
label variable transactionprice "TransactionPrice"
label variable transdatekey "TransactionUNIX"
label variable loanvaluedollarcnt "LoanValue"
label variable isarmslength "IsArmLength"

* convert date
gen transactiondate = transdatekey - td(01jan1950)
format transactiondate %td

label variable transactiondate "TransactionDate"

* drop now irrelevant variables
drop obs bestrecordflag transdatekey datasourcetype

* drop nonsensical observations
drop if propertyid < 0

drop if transactionprice < 15000 | transactionprice > 5000000

drop if isarmslength == "0" | isarmslength == "NA"
drop isarmslength

* destring
destring loanvalue, replace force

	
merge m:m countyregionid using "replication output\regionid_statename", keepusing(statename) keep(match)
drop _merge

gen county = string(countyregionid)

rename statename state
label variable state "State"

label variable county "County"

save "replication output\transaction_data_cleaned", replace


merge m:m propertyid using "data\preprocessed\pseudo\PropFacts"

drop if _merge == 2
drop _merge

merge m:m propertyid using "data\pseudo\uw6ft"

drop if _merge == 2
drop _merge

duplicates drop transactionid, force




* clean to prepare for merge
*destring postalcode, gen(postalcode_int) force

cap gen listing_year = year(listing_date)
replace listing_year = year(transactiondate) if listing_year == .

cap gen listingquarter = listing_date
replace listingquarter = transactiondate if listingquarter == .

format listingquarter %td

cap gen listing_quarter = qofd(listingquarter)
format listing_quarter %tq

drop listingquarter

* merge
merge m:m state listing_quarter using "data/preprocessed/actual/state_index_quarterly"
drop _merge

merge m:m state listing_year using "data/preprocessed/actual/state_index_yearly"
drop _merge

merge m:m postalcode_int listing_quarter using "data/preprocessed/actual/zipcode_index_quarterly"
drop if _merge == 2
drop _merge

merge m:m postalcode_int listing_year using "data/preprocessed/actual/zipcode_index_yearly"
drop if _merge == 2
drop _merge

* clean and label
drop listing_year listing_quarter

label variable uw6ft "UnderWater6ft"
*label variable sellingprice "ListingPrice"
label variable listing_date "ListingDate"





* create dates
gen transaction_year = year(transactiondate)

gen transactionquarter = transactiondate
format transactionquarter %td

gen transaction_quarter = qofd(transactionquarter)
format transaction_quarter %tq
drop transactionquarter

label variable transaction_quarter "TransactionQuarter"

* drop irrelevant observations
drop if transaction_year < 2008

drop if postalcode_int == . | postalcode_int == 0

* generate new variables
gen  ln_transaction_price = ln(transactionprice)
label variable ln_transaction_price "ln(TransactionPrice)"

destring(bedroomcnt bathroomcnt), replace force
replace bedroomcnt = 6 if bedroomcnt > 6 & bedroomcnt != .
replace bedroomcnt = 99 if bedroomcnt == .

replace bathroomcnt = 7 if bathroomcnt > 7 & bathroomcnt != .
replace bathroomcnt = 99 if bathroomcnt == .

replace bathroomcnt = round(bathroomcnt, 0.5)


destring(finishedsq lotsizesquarefeet yearbuilt majorremodelyear), replace force

xtile lotsizesquarefeet_pct = lotsizesquarefeet, n(10)
replace lotsizesquarefeet_pct = 99 if lotsizesquarefeet_pct == .
drop lotsizesquarefeet

destring finishedsquare, replace force
xtile finishedsquarefeet_pct = finishedsquarefeet, n(20)
replace finishedsquarefeet_pct = 99 if finishedsquarefeet_pct == .
drop finishedsquarefeet

gen     prop_age = transaction_year - yearbuilt
replace prop_age = 0 if prop_age < 0
replace prop_age = 98 if prop_age > 98 & prop_age != .
replace prop_age = 99 if prop_age == .
label variable prop_age "PropertyAge"

egen prop_age_group = cut(prop_age), at(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,99)
replace prop_age_group = 99 if prop_age_group == .
label variable prop_age_group "PropertyAge_grouped"

gen remodel_age = transaction_year - majorremodelyear
replace remodel_age = prop_age if prop_age < remodel_age & remodel_age != . & yearbuilt != .
replace remodel_age = prop_age if remodel_age == . & yearbuilt != .
replace remodel_age = 0 if remodel_age < 0
replace remodel_age = 98 if remodel_age > 98 & remodel_age != .
replace remodel_age = 99 if remodel_age == .
label variable remodel_age "YearsSinceMajorRemodel"

egen remodel_age_group = cut(remodel_age), at(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,99)
replace remodel_age_group = 99 if remodel_age_group == .
label variable remodel_age_group "YearsSinceMajorRemodel_grouped"

* create fixed effects
egen city_quarter_fe = group(city transaction_quarter state)
egen zip_quarter_fe  = group(postalcode transaction_quarter state)
egen county_quarter_fe = group(county transaction_quarter state)

*cluster
egen county_year = group(county transaction_year)
egen zip_year = group(postalcode transaction_year)
egen zip_quarter = group(postalcode transaction_quarter)

* create interactions
gen interaction5 = uw6ft * index_state_year
gen interaction6 = uw6ft * index_state_quarter
gen interaction7 = uw6ft * index_zipcode_year
gen interaction8 = uw6ft * index_zipcode_quarter

* label
label variable uw6ft "flood_zone"

label variable index_zipcode_quarter "index_zipcode_quarter"
label variable index_zipcode_year "index_zipcode_year"

label variable  interaction5 "flood_zone*index_state_year"
label variable  interaction6 "flood_zone*index_state_quarter"
label variable  interaction7 "flood_zone*index_zipcode_year"
label variable  interaction8 "flood_zone*index_zipcode_quarter"

compress

save "replication output/transaction_regression_data", replace
